suic_dataset <- read.csv('master.csv', strip.white = TRUE)
suic_dataset <- suic_dataset[, -8]
str(suic_dataset)
## 'data.frame': 27820 obs. of 11 variables:
## $ country : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
## $ age : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
## $ suicides_no : int 21 16 14 1 9 1 6 4 1 0 ...
## $ population : int 312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
## $ suicides.100k.pop : num 6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
## $ HDI.for.year : num NA NA NA NA NA NA NA NA NA NA ...
## $ gdp_for_year.... : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
## $ gdp_per_capita....: int 796 796 796 796 796 796 796 796 796 796 ...
## $ generation : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...
head(suic_dataset, 10)
## country year sex age suicides_no population
## 1 Albania 1987 male 15-24 years 21 312900
## 2 Albania 1987 male 35-54 years 16 308000
## 3 Albania 1987 female 15-24 years 14 289700
## 4 Albania 1987 male 75+ years 1 21800
## 5 Albania 1987 male 25-34 years 9 274300
## 6 Albania 1987 female 75+ years 1 35600
## 7 Albania 1987 female 35-54 years 6 278800
## 8 Albania 1987 female 25-34 years 4 257200
## 9 Albania 1987 male 55-74 years 1 137500
## 10 Albania 1987 female 5-14 years 0 311000
## suicides.100k.pop HDI.for.year gdp_for_year.... gdp_per_capita....
## 1 6.71 NA 2,156,624,900 796
## 2 5.19 NA 2,156,624,900 796
## 3 4.83 NA 2,156,624,900 796
## 4 4.59 NA 2,156,624,900 796
## 5 3.28 NA 2,156,624,900 796
## 6 2.81 NA 2,156,624,900 796
## 7 2.15 NA 2,156,624,900 796
## 8 1.56 NA 2,156,624,900 796
## 9 0.73 NA 2,156,624,900 796
## 10 0.00 NA 2,156,624,900 796
## generation
## 1 Generation X
## 2 Silent
## 3 Generation X
## 4 G.I. Generation
## 5 Boomers
## 6 G.I. Generation
## 7 Silent
## 8 Boomers
## 9 G.I. Generation
## 10 Generation X
sapply(suic_dataset, function(x) all(is.na(x) || is.infinite(x)))
## country year sex
## FALSE FALSE FALSE
## age suicides_no population
## FALSE FALSE FALSE
## suicides.100k.pop HDI.for.year gdp_for_year....
## FALSE TRUE FALSE
## gdp_per_capita.... generation
## FALSE FALSE
temp <- suic_dataset
new_colnames <- c('country', 'year', 'sex', 'age', 'suicidect', 'pop', 'suiciderate', 'hdiyear', 'gdpyear', 'gdpcapita', 'gen')
colnames(temp) = new_colnames
custom_factor <- function(cols){
for(i in cols){
level_c <- levels(temp[[i]])
label_c <- c()
for(j in 0:length(level_c)) label_c[j] <- j
temp[[i]] <<- factor(temp[[i]], labels = label_c, levels = level_c)
}
}
cols_ <- colnames(temp[, sapply(temp, is.factor)])
cols_ <- cols_[!cols_ %in% c('country', 'gdpcapita', 'age')]
custom_factor(cols_)
agelevels <- levels(temp$age)
agelevels
## [1] "15-24 years" "25-34 years" "35-54 years" "5-14 years" "55-74 years"
## [6] "75+ years"
agelabels <- c('gen_X', 'boomers', 'silent', 'gen_X', 'gen_GI', 'gen_GI')
temp$age <- factor(temp$age, levels = levels(temp$age), labels = c('gen_X', 'boomers', 'silent', 'gen_X', 'gen_GI', 'gen_GI'))
temp$gdpyear <- as.numeric(gsub(",","",levels(suic_dataset$gdp_for_year....), fixed = TRUE))[suic_dataset$gdp_for_year....]
temp$suiciderate <- (temp$suicidect / (temp$pop / 100000))
temp <- temp[, -11]
temp <- temp[!colnames(temp) %in% c('hdiyear')]
summary(temp)
## country year sex age
## Austria : 382 Min. :1985 1:13910 gen_X :9252
## Iceland : 382 1st Qu.:1995 2:13910 boomers:4642
## Mauritius : 382 Median :2002 silent :4642
## Netherlands: 382 Mean :2001 gen_GI :9284
## Argentina : 372 3rd Qu.:2008
## Belgium : 372 Max. :2016
## (Other) :25548
## suicidect pop suiciderate
## Min. : 0.0 Min. : 278 Min. : 0.0000
## 1st Qu.: 3.0 1st Qu.: 97498 1st Qu.: 0.9187
## Median : 25.0 Median : 430150 Median : 5.9909
## Mean : 242.6 Mean : 1844794 Mean : 12.8161
## 3rd Qu.: 131.0 3rd Qu.: 1486143 3rd Qu.: 16.6177
## Max. :22338.0 Max. :43805214 Max. :224.9719
##
## gdpyear gdpcapita
## Min. :4.692e+07 Min. : 251
## 1st Qu.:8.985e+09 1st Qu.: 3447
## Median :4.811e+10 Median : 9372
## Mean :4.456e+11 Mean : 16866
## 3rd Qu.:2.602e+11 3rd Qu.: 24874
## Max. :1.812e+13 Max. :126352
##
library(reshape)
totals <- function(data){
tl_df <- data.frame(NA, NA, NA, NA)
names(tl_df) <- c("country","year","total_pop", "total_suicide_ct")
tl_df <- na.omit(tl_df)
for(i in unique(data$country)){
for(j in unique(data$year)){
tdf <- data.frame(i, j, sum(data[data$country == i & data$year == j, 'pop']),
sum(data[data$country == i & data$year == j, 'suicidect']))
names(tdf) <- c("country", "year","total_pop", "total_suicide_ct")
tl_df <- rbind(tl_df, tdf)
}
}
country <- unique(tl_df$country)
tl_df <- reshape(tl_df, direction = "wide", idvar = c("country"), timevar = "year")
rownames(tl_df) <- unique(data$country)
return(tl_df[-1])
}
df <- totals(temp)
head(df,1)
## total_pop.1987 total_suicide_ct.1987 total_pop.1988
## Albania 2709600 73 2764300
## total_suicide_ct.1988 total_pop.1989 total_suicide_ct.1989
## Albania 63 2803100 68
## total_pop.1992 total_suicide_ct.1992 total_pop.1993
## Albania 2822500 47 2807300
## total_suicide_ct.1993 total_pop.1994 total_suicide_ct.1994
## Albania 73 2849300 50
## total_pop.1995 total_suicide_ct.1995 total_pop.1996
## Albania 2903400 88 2940200
## total_suicide_ct.1996 total_pop.1997 total_suicide_ct.1997
## Albania 89 2977300 170
## total_pop.1998 total_suicide_ct.1998 total_pop.1999
## Albania 3012700 154 3029700
## total_suicide_ct.1999 total_pop.2000 total_suicide_ct.2000
## Albania 139 2796300 54
## total_pop.2001 total_suicide_ct.2001 total_pop.2002
## Albania 2799349 119 2818839
## total_suicide_ct.2002 total_pop.2003 total_suicide_ct.2003
## Albania 133 2843929 124
## total_pop.2004 total_suicide_ct.2004 total_pop.2005
## Albania 2874991 146 2783320
## total_suicide_ct.2005 total_pop.2006 total_suicide_ct.2006
## Albania 0 2780176 0
## total_pop.2007 total_suicide_ct.2007 total_pop.2008
## Albania 2770344 124 2757059
## total_suicide_ct.2008 total_pop.2009 total_suicide_ct.2009
## Albania 160 2745735 0
## total_pop.2010 total_suicide_ct.2010 total_pop.1985
## Albania 2736025 96 0
## total_suicide_ct.1985 total_pop.1986 total_suicide_ct.1986
## Albania 0 0 0
## total_pop.1990 total_suicide_ct.1990 total_pop.1991
## Albania 0 0 0
## total_suicide_ct.1991 total_pop.2012 total_suicide_ct.2012
## Albania 0 0 0
## total_pop.2013 total_suicide_ct.2013 total_pop.2014
## Albania 0 0 0
## total_suicide_ct.2014 total_pop.2015 total_suicide_ct.2015
## Albania 0 0 0
## total_pop.2011 total_suicide_ct.2011 total_pop.2016
## Albania 0 0 0
## total_suicide_ct.2016
## Albania 0
total_df <- data.frame(NA, NA, NA, NA)
names(total_df) <- c("total_pop", "total_suicide_ct")
total_df <- na.omit(total_df)
for(i in unique(temp$country)){
tdf <- data.frame(sum(temp[temp$country == i, 'pop']) ,sum(temp[temp$country == i, 'suicidect']))
colnames(tdf) <- c("total_pop", "total_suicide_ct")
total_df <- rbind(total_df, tdf)
}
rm(tdf)
rm(i)
rm(j)
## Warning in rm(j): object 'j' not found
rownames(total_df) <- unique(temp$country)
df <- cbind(df, total_df)
scaled_df <- data.frame(scale(df))
head(scaled_df,1)
## total_pop.1987 total_suicide_ct.1987 total_pop.1988
## Albania -0.2698369 -0.2838701 -0.2533746
## total_suicide_ct.1988 total_pop.1989 total_suicide_ct.1989
## Albania -0.2795608 -0.2826843 -0.2809674
## total_pop.1992 total_suicide_ct.1992 total_pop.1993
## Albania -0.3660521 -0.3358863 -0.3537427
## total_suicide_ct.1993 total_pop.1994 total_suicide_ct.1994
## Albania -0.3087522 -0.3551332 -0.3062697
## total_pop.1995 total_suicide_ct.1995 total_pop.1996
## Albania -0.3629368 -0.3186826 -0.3776569
## total_suicide_ct.1996 total_pop.1997 total_suicide_ct.1997
## Albania -0.33355 -0.3819605 -0.3226696
## total_pop.1998 total_suicide_ct.1998 total_pop.1999
## Albania -0.3836518 -0.3350752 -0.3936341
## total_suicide_ct.1999 total_pop.2000 total_suicide_ct.2000
## Albania -0.3319577 -0.3986281 -0.3462654
## total_pop.2001 total_suicide_ct.2001 total_pop.2002
## Albania -0.3854671 -0.3290675 -0.3966008
## total_suicide_ct.2002 total_pop.2003 total_suicide_ct.2003
## Albania -0.3381362 -0.3964685 -0.3468724
## total_pop.2004 total_suicide_ct.2004 total_pop.2005
## Albania -0.3710656 -0.3315149 -0.3675342
## total_suicide_ct.2005 total_pop.2006 total_suicide_ct.2006
## Albania -0.354213 -0.38778 -0.3663077
## total_pop.2007 total_suicide_ct.2007 total_pop.2008
## Albania -0.3908419 -0.3454446 -0.388568
## total_suicide_ct.2008 total_pop.2009 total_suicide_ct.2009
## Albania -0.3492111 -0.4122316 -0.3830215
## total_pop.2010 total_suicide_ct.2010 total_pop.1985
## Albania -0.4139084 -0.371006 -0.340438
## total_suicide_ct.1985 total_pop.1986 total_suicide_ct.1986
## Albania -0.2876006 -0.344379 -0.2832239
## total_pop.1990 total_suicide_ct.1990 total_pop.1991
## Albania -0.428922 -0.3419059 -0.4308832
## total_suicide_ct.1991 total_pop.2012 total_suicide_ct.2012
## Albania -0.3493309 -0.4571275 -0.3847174
## total_pop.2013 total_suicide_ct.2013 total_pop.2014
## Albania -0.4491358 -0.3760499 -0.4507231
## total_suicide_ct.2014 total_pop.2015 total_suicide_ct.2015
## Albania -0.3780522 -0.4138309 -0.3459816
## total_pop.2011 total_suicide_ct.2011 total_pop.2016
## Albania -0.475017 -0.3864962 -0.2220068
## total_suicide_ct.2016 total_pop total_suicide_ct
## Albania -0.2783145 -0.3970021 -0.3559822
library(factoextra)
distance <- get_dist(scaled_df[, 65:66])
fviz_dist(distance, show_labels = TRUE, lab_size = NULL,
gradient = list(low = "blue", below_average = "yellow", average = "green", high = "red", extremely_high = "black"))
Taking 6 clusters as starting point
Animation of steps to classify data in clusters
library(animation)
library(heatmap3)
kmeans.ani(scaled_df[, 65:66], centers = 4)
km <- kmeans(scaled_df[, 65:66], centers = 4, nstart = 30)
str(km)
## List of 9
## $ cluster : Named int [1:101] 3 3 4 3 3 4 3 3 3 3 ...
## ..- attr(*, "names")= chr [1:101] "Albania" "Antigua and Barbuda" "Argentina" "Armenia" ...
## $ centers : num [1:4, 1:2] 4.126 1.487 -0.358 0.364 5.216 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:2] "total_pop" "total_suicide_ct"
## $ totss : num 200
## $ withinss : num [1:4] 12.55 9.34 1.7 1.4
## $ tot.withinss: num 25
## $ betweenss : num 175
## $ size : int [1:4] 3 8 79 11
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
km
## K-means clustering with 4 clusters of sizes 3, 8, 79, 11
##
## Cluster means:
## total_pop total_suicide_ct
## 1 4.1264518 5.21555114
## 2 1.4865704 0.87433520
## 3 -0.3578772 -0.29569613
## 4 0.3636707 0.06533262
##
## Clustering vector:
## Albania Antigua and Barbuda
## 3 3
## Argentina Armenia
## 4 3
## Aruba Australia
## 3 4
## Austria Azerbaijan
## 3 3
## Bahamas Bahrain
## 3 3
## Barbados Belarus
## 3 3
## Belgium Belize
## 3 3
## Bosnia and Herzegovina Brazil
## 3 2
## Bulgaria Cabo Verde
## 3 3
## Canada Chile
## 4 3
## Colombia Costa Rica
## 4 3
## Croatia Cuba
## 3 3
## Cyprus Czech Republic
## 3 3
## Denmark Dominica
## 3 3
## Ecuador El Salvador
## 3 3
## Estonia Fiji
## 3 3
## Finland France
## 3 2
## Georgia Germany
## 3 2
## Greece Grenada
## 3 3
## Guatemala Guyana
## 3 3
## Hungary Iceland
## 3 3
## Ireland Israel
## 3 3
## Italy Jamaica
## 2 3
## Japan Kazakhstan
## 1 4
## Kiribati Kuwait
## 3 3
## Kyrgyzstan Latvia
## 3 3
## Lithuania Luxembourg
## 3 3
## Macau Maldives
## 3 3
## Malta Mauritius
## 3 3
## Mexico Mongolia
## 2 3
## Montenegro Netherlands
## 3 3
## New Zealand Nicaragua
## 3 3
## Norway Oman
## 3 3
## Panama Paraguay
## 3 3
## Philippines Poland
## 4 4
## Portugal Puerto Rico
## 3 3
## Qatar Republic of Korea
## 3 2
## Romania Russian Federation
## 4 1
## Saint Kitts and Nevis Saint Lucia
## 3 3
## Saint Vincent and Grenadines San Marino
## 3 3
## Serbia Seychelles
## 3 3
## Singapore Slovakia
## 3 3
## Slovenia South Africa
## 3 4
## Spain Sri Lanka
## 4 3
## Suriname Sweden
## 3 3
## Switzerland Thailand
## 3 4
## Trinidad and Tobago Turkey
## 3 3
## Turkmenistan Ukraine
## 3 2
## United Arab Emirates United Kingdom
## 3 2
## United States Uruguay
## 1 3
## Uzbekistan
## 3
##
## Within cluster sum of squares by cluster:
## [1] 12.545821 9.338014 1.701099 1.404262
## (between_SS / total_SS = 87.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
fviz_cluster(km, data = scaled_df[, 65:66])
heatmap3(matrix(scaled_df$total_suicide_ct, km$cluster))
#Elbow Method
set.seed(300)
fviz_nbclust(scaled_df[, 65:66], kmeans, method="wss") + geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
#Silhouette Method
set.seed(300)
fviz_nbclust(scaled_df[, 65:66], kmeans, method="silhouette") + labs(subtitle = "Silhouette method")
#Gap Method
library(cluster)
gap_stat <- clusGap(scaled_df[, 65:66], FUN = kmeans, nstart = 30, K.max = 6, B = 50)
set.seed(300)
fviz_gap_stat(gap_stat) + labs(subtitle = "Gap-Statisitc method")
library(NbClust)
NbClust(scaled_df[, 65:66], distance = "euclidean", method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 8 proposed 14 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 2.6117 230.0293 99.8983 0.1649 144.6672 2842.5042 263.2695 60.1770
## 3 3.0197 278.1834 19.4650 1.4248 300.3264 1369.4856 222.9970 29.9526
## 4 1.2651 226.4452 34.7739 0.2665 338.1373 1674.3658 202.0378 24.9892
## 5 0.8136 236.9435 8.0957 0.9380 376.0461 1797.4806 93.8094 18.3948
## 6 0.6317 205.0013 1.1610 -0.4705 394.5005 2156.1259 87.8465 16.9642
## 7 1.0148 171.2935 1.0316 -2.2689 398.3723 2824.3539 88.3630 16.7594
## 8 0.9360 147.0010 0.1421 -3.8157 400.4381 3614.2657 87.8423 16.5774
## 9 2.4763 127.4548 14.9629 -5.2720 400.8860 4554.0638 87.7786 16.5521
## 10 0.4422 131.9320 -0.4015 -5.0019 454.8571 3294.8926 64.0149 14.2367
## 11 1.3235 116.8761 1.6115 -6.2722 453.7221 4031.8754 61.7071 14.2998
## 12 0.8162 107.0967 0.1103 -7.2171 460.3015 4495.6576 62.9542 14.0482
## 13 0.1085 97.1983 226.1515 -8.2603 460.8786 5246.0919 62.9249 14.0308
## 14 33.3059 333.8559 0.1565 3.9755 689.0486 635.4579 4.6682 3.9303
## 15 0.9876 307.0081 0.0698 3.0120 689.5247 726.0493 4.6483 3.9233
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
## 2 5.9240 3.3235 0.0972 0.5257 0.8853 0.8758 1.5599 0.0945
## 3 17.6053 6.6772 0.0700 0.6505 0.7851 1.7443 -3.8404 -0.2134
## 4 21.8784 8.0035 0.0437 0.8294 0.6751 3.4670 -54.0788 -0.7024
## 5 23.5831 10.8726 0.0399 0.6515 0.6727 2.1755 -36.2020 -0.5332
## 6 26.3227 11.7895 0.0246 0.7002 0.6127 1.5507 -8.1675 -0.3044
## 7 27.0722 11.9336 0.0216 0.7550 0.5000 0.7522 5.9289 0.3088
## 8 27.3485 12.0646 0.0187 0.8516 0.5035 75.7689 -12.8284 -0.8458
## 9 27.4338 12.0830 0.0197 0.8879 0.4464 1.1682 -0.7199 -0.1309
## 10 41.2026 14.0482 0.0150 0.7763 0.5283 5.6490 -9.0528 -0.4115
## 11 40.8996 13.9862 0.0134 0.8108 0.4940 129.9308 -7.9384 -0.8505
## 12 42.9912 14.2367 0.0124 0.7490 0.5046 185.9366 -4.9731 -0.8289
## 13 43.1931 14.2543 0.0129 0.7208 0.4956 0.2742 2.6476 1.3238
## 14 119.2248 50.8865 0.0187 0.6635 0.5062 0.6686 6.9383 0.4575
## 15 119.5790 50.9781 0.0184 0.6862 0.4939 4.1354 -25.0201 -0.7311
## Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex
## 2 0.5912 30.0885 0.8842 12.3143 0.0070 0.3567 0.0104 213.9798
## 3 0.5318 9.9842 0.7221 8.7587 0.0265 0.0482 0.0101 123.8242
## 4 0.4673 6.2473 0.5953 8.4034 0.0456 0.0297 0.0101 98.5980
## 5 0.4261 3.6790 0.5714 9.2239 0.0498 0.0152 0.0102 67.8472
## 6 0.3905 2.8274 0.4307 26.3424 0.0850 0.0199 0.0103 61.8064
## 7 0.3617 2.3942 0.3011 3.5458 0.1970 0.0045 0.0104 69.5797
## 8 0.3385 2.0722 0.2821 -17.8328 0.2065 0.0031 0.0104 70.9716
## 9 0.3192 1.8391 0.2416 -4.7044 0.3113 0.0040 0.0104 86.4207
## 10 0.3047 1.4237 0.2944 14.8357 0.1549 0.0045 0.0103 60.7570
## 11 0.2905 1.3000 0.2406 0.5687 0.2277 0.0040 0.0104 83.5982
## 12 0.2783 1.1707 0.2396 -18.8482 0.2146 0.0040 0.0104 84.5265
## 13 0.2674 1.0793 0.2122 -0.1163 0.2942 0.0029 0.0104 119.7723
## 14 0.2646 0.2807 0.2144 19.8254 0.2306 0.0053 0.0104 98.2165
## 15 0.2557 0.2616 0.2041 -15.5083 0.2538 0.0049 0.0104 125.6693
## Dindex SDbw
## 2 0.4897 1.8469
## 3 0.3249 1.1073
## 4 0.2603 1.0653
## 5 0.2271 0.6556
## 6 0.1874 0.6059
## 7 0.1705 0.6416
## 8 0.1621 0.6213
## 9 0.1594 0.6833
## 10 0.1358 0.4410
## 11 0.1328 0.5230
## 12 0.1261 0.4546
## 13 0.1231 0.4488
## 14 0.0855 0.2194
## 15 0.0842 0.2339
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 -0.5522 -30.9205 0.9118
## 3 -0.7431 -21.1118 1.0000
## 4 0.4040 112.0981 1.0000
## 5 0.4005 100.3069 1.0000
## 6 -0.1908 -143.5463 1.0000
## 7 0.0832 198.4289 0.7366
## 8 -0.1908 -81.1349 1.0000
## 9 -0.0307 -167.9011 1.0000
## 10 -0.7431 -25.8033 1.0000
## 11 -0.1908 -49.9292 1.0000
## 12 -0.2510 -24.9172 1.0000
## 13 -0.7431 -2.3458 0.4303
## 14 0.0222 618.0234 0.6383
## 15 0.2234 114.6927 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 14.0000 14.0000 13.0000 14.0000 14.00 14.000 5.0000
## Value_Index 33.3059 333.8559 226.0412 3.9755 228.17 4701.225 108.2284
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.000 14.0000 14.0000 12.0000 2.0000 2.0000 2.0000
## Value_Index 25.261 76.0318 -36.5407 0.0124 0.5257 0.8853 0.8758
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey
## Number_clusters 4.0000 2.0000 2.0000 3.0000 2.0000 7.0000
## Value_Index -54.0788 0.0945 0.5912 20.1043 0.8842 3.5458
## McClain Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.000 2.0000 0 10.000 0 14.0000
## Value_Index 0.007 0.3567 0 60.757 0 0.2194
##
## $Best.partition
## Albania Antigua and Barbuda
## 2 2
## Argentina Armenia
## 2 2
## Aruba Australia
## 2 2
## Austria Azerbaijan
## 2 2
## Bahamas Bahrain
## 2 2
## Barbados Belarus
## 2 2
## Belgium Belize
## 2 2
## Bosnia and Herzegovina Brazil
## 2 1
## Bulgaria Cabo Verde
## 2 2
## Canada Chile
## 2 2
## Colombia Costa Rica
## 2 2
## Croatia Cuba
## 2 2
## Cyprus Czech Republic
## 2 2
## Denmark Dominica
## 2 2
## Ecuador El Salvador
## 2 2
## Estonia Fiji
## 2 2
## Finland France
## 2 2
## Georgia Germany
## 2 2
## Greece Grenada
## 2 2
## Guatemala Guyana
## 2 2
## Hungary Iceland
## 2 2
## Ireland Israel
## 2 2
## Italy Jamaica
## 2 2
## Japan Kazakhstan
## 1 2
## Kiribati Kuwait
## 2 2
## Kyrgyzstan Latvia
## 2 2
## Lithuania Luxembourg
## 2 2
## Macau Maldives
## 2 2
## Malta Mauritius
## 2 2
## Mexico Mongolia
## 2 2
## Montenegro Netherlands
## 2 2
## New Zealand Nicaragua
## 2 2
## Norway Oman
## 2 2
## Panama Paraguay
## 2 2
## Philippines Poland
## 2 2
## Portugal Puerto Rico
## 2 2
## Qatar Republic of Korea
## 2 2
## Romania Russian Federation
## 2 1
## Saint Kitts and Nevis Saint Lucia
## 2 2
## Saint Vincent and Grenadines San Marino
## 2 2
## Serbia Seychelles
## 2 2
## Singapore Slovakia
## 2 2
## Slovenia South Africa
## 2 2
## Spain Sri Lanka
## 2 2
## Suriname Sweden
## 2 2
## Switzerland Thailand
## 2 2
## Trinidad and Tobago Turkey
## 2 2
## Turkmenistan Ukraine
## 2 2
## United Arab Emirates United Kingdom
## 2 2
## United States Uruguay
## 1 2
## Uzbekistan
## 2
So we stick with 4 clusters and hence we have a robust model. Calculating goodness
goodness_km <- km$betweenss / km$totss
goodness_km
## [1] 0.875054
clusters_df <- data.frame(NA, NA)
colnames(clusters_df) <- c("country", "cluster")
clusters_df <- na.omit(clusters_df)
for(i in levels(factor(km$cluster))){
rnames <- rownames(df[km$cluster == i, ])
for(j in rnames){
tempdf <- data.frame(j, i)
colnames(tempdf) <- c("country", "cluster")
clusters_df <- rbind(clusters_df, tempdf)
}
}
rownames(clusters_df) <- clusters_df$country
clusters_df <- clusters_df[-1]
df <- merge(clusters_df,df, by='row.names', all=TRUE)
scaled_df <- merge(clusters_df, scaled_df, by="row.names", all = TRUE)
library(caTools)
library(dplyr)
set.seed(123)
n = nrow(scaled_df)
split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.8, 0.2))
training_set <- scaled_df[split, c(1,2,67,68)]
test_set <- scaled_df[!split, c(1,2,67,68)]
str(test_set)
## 'data.frame': 18 obs. of 4 variables:
## $ Row.names : 'AsIs' chr "Armenia" "Aruba" "Azerbaijan" "Barbados" ...
## $ cluster : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 2 3 4 3 3 3 ...
## $ total_pop : num -0.384 -0.451 -0.353 -0.447 3.872 ...
## $ total_suicide_ct: num -0.356 -0.366 -0.358 -0.366 0.877 ...
library(clue)
pred <- cl_predict(km, training_set[,3:4])
str(pred)
## 'cl_class_ids' int [1:83] 3 3 4 4 3 3 3 3 3 3 ...
result_df <- data.frame(cbind(original_cluster = training_set$cluster,
predicted_cluster = pred))
rownames(result_df) <- training_set$Row.names
head(result_df, 10)
## original_cluster predicted_cluster
## Albania 3 3
## Antigua and Barbuda 3 3
## Argentina 4 4
## Australia 4 4
## Austria 3 3
## Bahamas 3 3
## Bahrain 3 3
## Belarus 3 3
## Belgium 3 3
## Belize 3 3
correctly_predicted <- nrow(result_df[result_df$original_cluster == result_df$predicted_cluster, ])
accuracy_rate <- correctly_predicted / nrow(result_df)
accuracy_rate
## [1] 1